## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Warning in library(stringr): package 'stringr' already present in search()
## now dyn.load("/Library/Frameworks/R.framework/Versions/3.6/Resources/library/Matrix/libs/Matrix.so") ...
## now dyn.load("/Library/Frameworks/R.framework/Versions/3.6/Resources/library/SnowballC/libs/SnowballC.so") ...
## now dyn.load("/Library/Frameworks/R.framework/Versions/3.6/Resources/library/tokenizers/libs/tokenizers.so") ...
## Loading required package: jiebaRD
## now dyn.load("/Library/Frameworks/R.framework/Versions/3.6/Resources/library/jiebaR/libs/jiebaR.so") ...
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
將會產生兩個data frames, posts and comments。
df<-posts%>%
left_join(comments,by = "plink")%>%
select(commentor,ccontent,ptitle)%>%
mutate(comment_id = row_number())
df針對posts 跟 comments 兩個資料框合併並做處理
cutter <- worker()
segment_not <- c("蔡英文", "韓國瑜", "國民黨", "民進黨", "綠營", "藍營","韓總","韓粉","柯文哲")
new_user_word(cutter, segment_not)## [1] TRUE
將資料框做斷詞處理
unnested_df<-df %>%
mutate(word = purrr::map(ccontent, function(x)segment(x, cutter))) %>%
unnest(word) %>%
filter(!is.na(word)) %>%
filter(!(word %in% stopWords$word)) %>%
filter(!str_detect(word, "[a-zA-Z0-9]+"))
unnested_df將斷詞後的資料框,查找留言是否有出現以下規則,並將其出現次數變成新的Feature,以利我們後續用PCA來降維
featured_df <- unnested_df%>%
group_by(commentor,ccontent,ptitle,comment_id)%>%
mutate(f1=sum(str_detect(word, "韓國瑜")))%>%
mutate(f2=sum(str_detect(word, "韓粉")))%>%
mutate(f3=sum(str_detect(word, "韓")))%>%
mutate(f4=sum(str_detect(word, "韓總")))%>%
mutate(f5=sum(str_detect(word, "蔡英文")))%>%
mutate(f6=sum(str_detect(word, "國民黨")))%>%
mutate(f7=sum(str_detect(word, "民進黨")))%>%
mutate(f8=sum(str_detect(word, "綠營")))%>%
mutate(f9=sum(str_detect(word, "藍營")))%>%
mutate(f10=sum(str_detect(word, "蔡總統")))%>%
mutate(f11=sum(str_detect(word, "韓市長")))%>%
mutate(f13=sum( str_detect(word, "韓國瑜")&& str_detect(word, "好|讚|棒|厲害|強|負責|支持|投給") ))%>%
mutate(f14=sum( str_detect(word, "蔡英文")&& str_detect(word, "好|讚|棒|厲害|強|負責|支持|投給") ))%>%
mutate(f15=sum( str_detect(word, "韓國瑜")&& str_detect(word, "不好|爛|糟|垃圾|強|白痴|笨蛋|不支持|不投給|下台") ))%>%
mutate(f16=sum( str_detect(word, "蔡英文")&& str_detect(word, "不好|爛|糟|垃圾|強|白痴|笨蛋|不支持|不投給|下台") ))%>%
mutate(f17=sum( str_detect(word, "國民黨|藍營")&& str_detect(word, "好|讚|棒|厲害|強|負責|支持|投給") ))%>%
mutate(f18=sum( str_detect(word, "國民黨|藍營")&& str_detect(word, "不好|爛|糟|垃圾|強|白痴|笨蛋|不支持|不投給|下台") ))%>%
mutate(f19=sum( str_detect(word, "民進黨|綠營")&& str_detect(word, "好|讚|棒|厲害|強|負責|支持|投給") ))%>%
mutate(f20=sum( str_detect(word, "民進黨|綠營")&& str_detect(word, "不好|爛|糟|垃圾|強|白痴|笨蛋|不支持|不投給|下台") ))%>%
ungroup()將重複的row移開,變成乾淨的資料框,以利我們後續用PCA來降維
將前面自己定義的那些Feature丟入PCA做降維
畫出各個PC的variance
將PC1排序,並發現到PC1分數越高是挺韓者,而分數越低是相對黑韓,故我們輸出分數最高前五位的回文內容與POST標題
我的做法所找出來的這些人,就文字來看確實就是挺韓者, 而我會有把握的原因為:我已經將主要會影響到是否挺韓的因素,都變成新的feature了, 這些規則一共有20個,我再透過PCA來找出最重要影響是否挺韓的主成份,並依此排序找出分數最高前五位的回文內容與POST標題。 故這樣的邏輯算是蠻有把握找出最挺韓的5個回文者。